{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from glob import glob\n",
    "\n",
    "import regex as re\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Sample SPARQL query used to try out the helpers below: it selects ?itemLabel for at most 10 items.\n",
    "# NOTE(review): the path wdt:P106/wdt:P279 follows exactly one wdt:P279 hop; if every\n",
    "# transitive subclass is wanted it would need wdt:P279* -- confirm the intent.\n",
    "query=\"\"\"\n",
    "SELECT ?itemLabel \n",
    "WHERE {\n",
    "  ?item wdt:P31 wd:Q5 . #instance of human\n",
    "        ?item wdt:P106/wdt:P279 wd:Q639669 . #occupation a subclass of musician\n",
    "  SERVICE wikibase:label {\n",
    "    bd:serviceParam wikibase:language \"en\" .\n",
    "   }\n",
    "}\n",
    "\n",
    "LIMIT 10\n",
    "\"\"\"\n",
    "\n",
    "# Matches strings made of nothing but 'Q' followed by digits; save_data() skips such lines.\n",
    "QID_REGEX=re.compile(r'^Q[0-9]+$')\n",
    "\n",
    "\n",
    "# SPARQL endpoint that download_data() sends every query to.\n",
    "url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'\n",
    "def get_query(queryfile):\n",
    "    \"\"\"Return the text of ``queryfile`` with leading/trailing whitespace removed.\"\"\"\n",
    "    with open(queryfile) as handle:\n",
    "        contents = handle.read()\n",
    "    return contents.strip()\n",
    "\n",
    "def download_data(query, data_format=\"json\"):\n",
    "    \"\"\"Run ``query`` against the SPARQL endpoint in the module-level ``url``.\n",
    "\n",
    "    Args:\n",
    "        query: SPARQL query text.\n",
    "        data_format: \"xml\" requests XML; any other value requests JSON.\n",
    "\n",
    "    Returns:\n",
    "        For \"xml\" the ``requests`` response object (callers read ``.text``);\n",
    "        otherwise the decoded JSON body.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.\n",
    "    \"\"\"\n",
    "    wants_xml = data_format == \"xml\"\n",
    "    response = requests.get(\n",
    "        url, params={'query': query, 'format': 'xml' if wants_xml else 'json'})\n",
    "    # Fail loudly on an error status instead of handing an error page to the caller.\n",
    "    response.raise_for_status()\n",
    "    if wants_xml:\n",
    "        return response\n",
    "    data = response.json()\n",
    "    print(\"Downloaded %s records\" % len(data[\"results\"][\"bindings\"]))\n",
    "    return data\n",
    "\n",
    "def save_data(data, outfile, label=\"itemLabel\", data_format=\"json\"):\n",
    "    \"\"\"Write one label per line to ``outfile`` as UTF-8, skipping bare QIDs.\n",
    "\n",
    "    Args:\n",
    "        data: for \"xml\", an object whose ``.text`` is the XML response body;\n",
    "            otherwise a dict with the rows under ``data[\"results\"][\"bindings\"]``.\n",
    "        outfile: path of the file to create or overwrite.\n",
    "        label: key read from each JSON binding (ignored for XML).\n",
    "        data_format: \"xml\" selects XML parsing; any other value selects JSON.\n",
    "\n",
    "    Returns:\n",
    "        True. The processed/written counts are printed, not returned.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a JSON binding has no ``label`` entry.\n",
    "    \"\"\"\n",
    "    # Locate the result rows before opening the file, so a response without them\n",
    "    # does not truncate an existing outfile.\n",
    "    if data_format == \"xml\":\n",
    "        # bs4 is only needed on the XML path, so it stays a lazy import.\n",
    "        from bs4 import BeautifulSoup\n",
    "        results = BeautifulSoup(data.text, \"lxml\").find_all(\"result\")\n",
    "        lines = (result.text.strip() for result in results)\n",
    "    else:\n",
    "        lines = (row[label][\"value\"] for row in data[\"results\"][\"bindings\"])\n",
    "\n",
    "    processed, written = 0, 0\n",
    "    with open(outfile, \"wb\") as fp:\n",
    "        for line in lines:\n",
    "            processed += 1\n",
    "            # Lines that are nothing but a QID are counted but not written.\n",
    "            if QID_REGEX.match(line):\n",
    "                continue\n",
    "            # Encode explicitly so non-ASCII labels are written rather than failing the\n",
    "            # implicit ASCII conversion a Python 2 byte-mode file would apply.\n",
    "            raw = line if isinstance(line, bytes) else line.encode(\"utf-8\")\n",
    "            fp.write(raw + b\"\\n\")\n",
    "            written += 1\n",
    "    print(\"Processed: %s, Written: %s\" % (processed, written))\n",
    "    return True\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded 10 records\n"
     ]
    }
   ],
   "source": [
    "# Try download_data() on whatever text is currently bound to `query`.\n",
    "data = download_data(query=query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed: 10, Written: 9\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save the labels from the response held in `data` to temp.txt.\n",
    "save_data(data, \"data/wikidata/downloaded/temp.txt\", label=\"itemLabel\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['data/wikidata/queries/persons.txt', 'data/wikidata/queries/cities.txt', 'data/wikidata/queries/sportsteams.txt', 'data/wikidata/queries/movies.txt', 'data/wikidata/queries/music_artists.txt', 'data/wikidata/queries/products.txt', 'data/wikidata/queries/tvshows.txt']\n"
     ]
    }
   ],
   "source": [
    "# Collect every .txt file under data/wikidata/queries/; a later cell loops over this list.\n",
    "query_files = glob(\"data/wikidata/queries/*.txt\")\n",
    "print query_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Directory the per-query result files are written to (note the trailing slash).\n",
    "OUTDIR=\"./data/wikidata/downloaded/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing data/wikidata/queries/companynames.txt\n",
      "Downloaded 101206 records\n",
      "Saving to ./data/wikidata/downloaded//companynames.txt.results.txt\n",
      "Processed: 101206, Written: 68321\n"
     ]
    }
   ],
   "source": [
    "for q in [\"data/wikidata/queries/companynames.txt\"]:\n",
    "    print(\"Processing %s\" % q)\n",
    "    # Query files with \"persons\" or \"movies\" in their path are fetched as XML, the rest as JSON.\n",
    "    data_format = \"xml\" if (\"persons\" in q or \"movies\" in q) else \"json\"\n",
    "    base_file = os.path.basename(q)\n",
    "    # os.path.join copes with OUTDIR's trailing slash, so the path has no \"//\".\n",
    "    outfile = os.path.join(OUTDIR, \"%s.results.txt\" % base_file)\n",
    "    query = get_query(q)\n",
    "    data = download_data(query=query, data_format=data_format)\n",
    "    print(\"Saving to %s\" % outfile)\n",
    "    save_data(data, outfile, label=\"itemLabel\", data_format=data_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing data/wikidata/queries/buildings.txt\n",
      "Downloaded 96624 records\n",
      "Saving to ./data/wikidata/downloaded//buildings.txt.results.txt\n",
      "Processed: 96624, Written: 70490\n"
     ]
    }
   ],
   "source": [
    "for q in [\"data/wikidata/queries/buildings.txt\"]:\n",
    "    print(\"Processing %s\" % q)\n",
    "    # Query files with \"persons\" or \"movies\" in their path are fetched as XML, the rest as JSON.\n",
    "    data_format = \"xml\" if (\"persons\" in q or \"movies\" in q) else \"json\"\n",
    "    base_file = os.path.basename(q)\n",
    "    # os.path.join copes with OUTDIR's trailing slash, so the path has no \"//\".\n",
    "    outfile = os.path.join(OUTDIR, \"%s.results.txt\" % base_file)\n",
    "    query = get_query(q)\n",
    "    data = download_data(query=query, data_format=data_format)\n",
    "    print(\"Saving to %s\" % outfile)\n",
    "    save_data(data, outfile, label=\"itemLabel\", data_format=data_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing data/wikidata/queries/persons.txt\n",
      "Saving to ./data/wikidata/downloaded//persons.txt.results.txt\n",
      "Processed: 48164, Written: 41824\n",
      "Processing data/wikidata/queries/cities.txt\n",
      "Downloaded 20347 records\n",
      "Saving to ./data/wikidata/downloaded//cities.txt.results.txt\n",
      "Processed: 20347, Written: 17930\n",
      "Processing data/wikidata/queries/sportsteams.txt\n",
      "Downloaded 2549 records\n",
      "Saving to ./data/wikidata/downloaded//sportsteams.txt.results.txt\n",
      "Processed: 2549, Written: 2328\n",
      "Processing data/wikidata/queries/movies.txt\n",
      "Saving to ./data/wikidata/downloaded//movies.txt.results.txt\n",
      "Processed: 187738, Written: 110628\n",
      "Processing data/wikidata/queries/music_artists.txt\n",
      "Downloaded 80438 records\n",
      "Saving to ./data/wikidata/downloaded//music_artists.txt.results.txt\n",
      "Processed: 80438, Written: 63904\n",
      "Processing data/wikidata/queries/products.txt\n",
      "Downloaded 33 records\n",
      "Saving to ./data/wikidata/downloaded//products.txt.results.txt\n",
      "Processed: 33, Written: 32\n",
      "Processing data/wikidata/queries/tvshows.txt\n",
      "Downloaded 20731 records\n",
      "Saving to ./data/wikidata/downloaded//tvshows.txt.results.txt\n",
      "Processed: 20731, Written: 8161\n"
     ]
    }
   ],
   "source": [
    "for q in query_files:\n",
    "    print(\"Processing %s\" % q)\n",
    "    # Query files with \"persons\" or \"movies\" in their path are fetched as XML, the rest as JSON.\n",
    "    data_format = \"xml\" if (\"persons\" in q or \"movies\" in q) else \"json\"\n",
    "    base_file = os.path.basename(q)\n",
    "    # os.path.join copes with OUTDIR's trailing slash, so the path has no \"//\".\n",
    "    outfile = os.path.join(OUTDIR, \"%s.results.txt\" % base_file)\n",
    "    query = get_query(q)\n",
    "    data = download_data(query=query, data_format=data_format)\n",
    "    print(\"Saving to %s\" % outfile)\n",
    "    save_data(data, outfile, label=\"itemLabel\", data_format=data_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'SELECT ?itemLabel WHERE {\\n  ?item wdt:P31 wd:Q5.\\n  SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }\\n}'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the text currently bound to `query`; the loop cells reassign it on every iteration.\n",
    "query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Fetch the current `query` as XML through the shared helper.\n",
    "data = download_data(query=query, data_format=\"xml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed: 47079, Written: 40913\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): relies on `data` and `outfile` still being bound from earlier cells;\n",
    "# `outfile` is only ever assigned inside the loop cells.\n",
    "save_data(data, outfile=outfile, data_format=\"xml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
